{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Manifold Learning - How to improve Classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Toy example of how to use manifold learning / dimensionality reduction for feature engineering."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Automatically created module for IPython interactive environment\n"
     ]
    }
   ],
   "source": [
    "print(__doc__)\n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "from matplotlib.ticker import NullFormatter\n",
    "%matplotlib inline\n",
    "from sklearn import manifold, datasets\n",
    "from sklearn import random_projection\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "from sklearn import neighbors, linear_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KNN score: 1.000000\n",
      "LogisticRegression score: 1.000000\n"
     ]
    }
   ],
   "source": [
    "# Two-class subset of the digits data set\n",
    "digits = datasets.load_digits(n_class=2)\n",
    "X_digits, y_digits = digits.data, digits.target\n",
    "n_samples, n_features = X_digits.shape\n",
    "\n",
    "# Ordered (unshuffled) hold-out: first 90% of the rows train, the remainder tests\n",
    "split_at = int(.9 * n_samples)\n",
    "X_train, X_test = X_digits[:split_at], X_digits[split_at:]\n",
    "y_train, y_test = y_digits[:split_at], y_digits[split_at:]\n",
    "\n",
    "# Baseline classifiers on the 64 raw features only\n",
    "knn = neighbors.KNeighborsClassifier()\n",
    "logistic = linear_model.LogisticRegression()\n",
    "\n",
    "knn_score = knn.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('KNN score: %f' % knn_score)\n",
    "logistic_score = logistic.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('LogisticRegression score: %f'\n",
    "      % logistic_score)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Try out different manifold-learned features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target size for the alternative (commented-out) transformers below.\n",
    "# The active TSNE transformer is fixed at 3 components and does not read it.\n",
    "n_additional_features = 22\n",
    "#transformer = random_projection.SparseRandomProjection(n_additional_features, random_state=42)\n",
    "#transformer = random_projection.GaussianRandomProjection(n_additional_features, random_state=0)\n",
    "#transformer = manifold.MDS(n_additional_features, max_iter=300, n_init=3)\n",
    "transformer = manifold.TSNE(\n",
    "    n_components=3,\n",
    "    init='pca',\n",
    "    random_state=42,\n",
    ")\n",
    "#transformer = PCA(n_additional_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KNN score: 0.944444\n",
      "LogisticRegression score: 0.916667\n"
     ]
    }
   ],
   "source": [
    "# Learn the extra features on the training rows and append them to the raw inputs\n",
    "New_features_train = transformer.fit_transform(X_train)\n",
    "X_train2 = np.hstack([X_train, New_features_train])\n",
    "\n",
    "# fit_transform alone cannot embed unseen samples (e.g. TSNE has no transform()),\n",
    "# so approximate one by hand: least-squares linear map from X_train onto the learned features\n",
    "Handmade_fit_transform = np.linalg.lstsq(X_train, New_features_train, rcond=None)\n",
    "Transformation_matrix = Handmade_fit_transform[0]\n",
    "\n",
    "# Push the test rows through the same linear map and append the result\n",
    "X_test2 = np.hstack([X_test, X_test @ Transformation_matrix])\n",
    "\n",
    "knn_score = knn.fit(X_train2, y_train).score(X_test2, y_test)\n",
    "print('KNN score: %f' % knn_score)\n",
    "logistic_score = logistic.fit(X_train2, y_train).score(X_test2, y_test)\n",
    "print('LogisticRegression score: %f'\n",
    "      % logistic_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KNN score: 0.963889\n",
      "LogisticRegression score: 0.897222\n"
     ]
    }
   ],
   "source": [
    "# Reference point: the same two classifiers refit on the raw features only\n",
    "knn_score = knn.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('KNN score: %f' % knn_score)\n",
    "logistic_score = logistic.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('LogisticRegression score: %f'\n",
    "      % logistic_score)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Now let's try not to cheat - getting rid of fit_transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regress the features learned on the training rows onto the raw training inputs.\n",
    "# Only training data is involved here, so nothing from the test rows enters the fit.\n",
    "a = X_train\n",
    "b = New_features_train\n",
    "# Least-squares transformation matrix via the pseudo-inverse\n",
    "# (alternative: np.linalg.lstsq(a, b, rcond=None)[0])\n",
    "x = np.matmul(np.linalg.pinv(a), b)\n",
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "b.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Six-class digits; not used further in this cell (only by the commented-out lines)\n",
    "digits2 = datasets.load_digits(n_class=6)\n",
    "#X_digits2 = digits2.data\n",
    "#y_digits2 = digits2.target\n",
    "\n",
    "# Embed ALL rows at once (train and test together), then append the embedding\n",
    "Y_digits = transformer.fit_transform(X_digits)\n",
    "X_digits2 = np.hstack([X_digits, Y_digits])\n",
    "#X_digits2 = Y_digits\n",
    "#X_digits2 = X_digits2[:,64:66]\n",
    "\n",
    "# Ordered 90/10 split of the augmented matrix and of the labels\n",
    "split_at = int(.9 * n_samples)\n",
    "X_train2, X_test2 = X_digits2[:split_at], X_digits2[split_at:]\n",
    "y_train2, y_test2 = y_digits[:split_at], y_digits[split_at:]\n",
    "\n",
    "knn_score = knn.fit(X_train2, y_train2).score(X_test2, y_test2)\n",
    "print('KNN score: %f' % knn_score)\n",
    "logistic_score = logistic.fit(X_train2, y_train2).score(X_test2, y_test2)\n",
    "print('LogisticRegression score: %f'\n",
    "      % logistic_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy system (2 equations, 3 unknowns) to inspect what np.linalg.lstsq returns\n",
    "a = np.array([[3, 1, 2], [2, 1, 2]])\n",
    "b = np.array([9, 8])\n",
    "# Pass rcond explicitly, like the other lstsq calls in this notebook;\n",
    "# NumPy 1.14+ emits a FutureWarning when it is omitted\n",
    "x = np.linalg.lstsq(a, b, rcond=None)\n",
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[  7.535404 , -30.34967  ,  -8.010539 ],\n",
       "       [  3.6932597,  21.434452 ,  15.182379 ],\n",
       "       [ -0.8953111,  13.490789 ,   8.821477 ],\n",
       "       ...,\n",
       "       [-14.937363 ,   5.912628 ,  10.598797 ],\n",
       "       [  4.0065293,  27.213686 ,  13.419299 ],\n",
       "       [  1.5898446,   6.8484554, -23.656971 ]], dtype=float32)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Add_features = transformer.fit_transform(X_train)\n",
    "Add_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(64, 3)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(64, 974)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.linalg.pinv(X_train).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Least-squares linear map from the raw training inputs onto Add_features\n",
    "Handmade_fit_transform = np.linalg.lstsq(\n",
    "    X_train,\n",
    "    Add_features,\n",
    "    rcond=None,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "Transformation_Matrix = Handmade_fit_transform[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project the held-out rows with the linear map fitted on the training rows\n",
    "Add_features_test = X_test @ Transformation_Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 11.244668 , -38.748028 ],\n",
       "       [  8.261975 ,  32.655094 ],\n",
       "       [ -4.286929 ,  29.210676 ],\n",
       "       ...,\n",
       "       [-22.301434 ,   1.9374135],\n",
       "       [  5.9737096,  38.21313  ],\n",
       "       [  2.414325 ,   1.5860955]], dtype=float32)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "New_features_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
